STACAS vs other integration methods on heterogeneous T cell datasets
Introduction
Here we will compare integration results of STACAS with other R packages on a collection of scRNA-seq datasets of mouse tumor-infiltrating T cells from multiple studies from Andreatta et al
The data are available at: figshare/23136746
R environment
Get and load some useful packages
renv::restore()
if (!require("remotes", quietly = TRUE))
install.packages("remotes")
library(remotes)
if (!requireNamespace("STACAS", quietly = TRUE))
remotes::install_github("carmonalab/STACAS")
if (!requireNamespace("STACAS", quietly = TRUE))
remotes::install_github("carmonalab/scGate")
if (!requireNamespace("harmony", quietly = TRUE))
remotes::install_github("immunogenomics/harmony", ref="6866e01") #fix version at Sep 8, 2022
if (!requireNamespace("SeuratWrappers", quietly = TRUE))
remotes::install_github('satijalab/seurat-wrappers')
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!require("batchelor", quietly = TRUE))
BiocManager::install("batchelor")
if (!require("tidyr", quietly = TRUE))
install.packages("tidyr")
if (!require("tidytext", quietly = TRUE))
install.packages("tidytext")
if (!require("scIntegrationMetrics", quietly = TRUE))
install_github("carmonalab/scIntegrationMetrics") #calculates LISI and Silhouettelibrary(Seurat)
library(dplyr)
library(ggplot2)
library(STACAS)
library(scGate)
library(harmony)
library(SeuratWrappers)
library(batchelor)
library(tidyr)
library(scIntegrationMetrics)
library(patchwork)
library(tidytext)
seed = 1234
set.seed(seed)Load test datasets
A merged dataset is available as a Seurat object
download <- F
where <- 'aux'
dir.create(where, showWarnings = FALSE)
rds.path <- sprintf("%s/ref_TILAtlas_mouse_v1.rds", where)
if(download){
options(timeout=500)
url <- "https://figshare.com/ndownloader/files/23136746"
download.file(url = url, destfile = rds.path)
}
object <- readRDS(rds.path)
DefaultAssay(object) <- "RNA"Annotated subtypes and dataset/study are stored in
functional.cluster and Study metadata column,
respectively.
meta.batch <- "Study"
meta.label <- "functional.cluster"batchLabels.table <- table(object@meta.data[[meta.label]], object@meta.data[[meta.batch]])
batchLabels.table##
## Carmona Ekiz Magen_dLN Magen_TILs MC38_dLN Singer Xiong
## CD8_Tex 375 211 0 6 12 161 1328
## CD8_Tpex 207 26 0 33 26 53 419
## CD8_EffectorMemory 1700 75 0 10 64 89 482
## CD8_EarlyActiv 373 27 0 1 149 26 56
## CD8_NaiveLike 3492 61 0 0 1708 78 181
## CD4_NaiveLike 6 4 13 0 457 0 23
## Tfh 2 1 818 17 205 0 0
## Th1 50 63 70 477 614 2 384
## Treg 13 54 49 237 253 1 1561
For integration metrics, do not consider labels that are contributed
in more than thrMaxPerBatch (90%) by a single
dataset/batch. Additionally, at least minBatchesPerCellType
(2) datasets have to contribute with at least
thrMinPerBatch (5%) each. Then number of datasets over this
threshold can be used to calculate normalized LISI scores per label.
batchLabels <- round( batchLabels.table / rowSums(batchLabels.table) * 100)
batchLabels##
## Carmona Ekiz Magen_dLN Magen_TILs MC38_dLN Singer Xiong
## CD8_Tex 18 10 0 0 1 8 63
## CD8_Tpex 27 3 0 4 3 7 55
## CD8_EffectorMemory 70 3 0 0 3 4 20
## CD8_EarlyActiv 59 4 0 0 24 4 9
## CD8_NaiveLike 63 1 0 0 31 1 3
## CD4_NaiveLike 1 1 3 0 91 0 5
## Tfh 0 0 78 2 20 0 0
## Th1 3 4 4 29 37 0 23
## Treg 1 2 2 11 12 0 72
thrMaxPerBatch <- 90
minBatchesPerCellType <- 2
thrMinPerBatch <- 5bachesPerLabel <- apply(batchLabels,1,function(x) sum(x > thrMinPerBatch) )
removeLabels <- names(bachesPerLabel[bachesPerLabel < minBatchesPerCellType ])
removeLabels <- unique(removeLabels,
names(which(apply(
batchLabels, 1, function(x) max(x) > thrMaxPerBatch)
))) # contributed in more than 90% by a single dataset/batch
removeLabels## [1] "CD4_NaiveLike"
metricsLabels <- setdiff(unique(object@meta.data[[meta.label]]),removeLabels)Set critical parameters
nfeatures <- 2000 # number of highly variable genes for dimensionality reduction
ndim <- 20 # number of PCA components for dimensionality reduction
lisi_perplexity <- 30 # number of neighbors for LISI (see below for details) to measure batch mixingHow does the collection of datasets look without any integration?
Run a standard Seurat pipeline for dimensionality reduction.
library(data.table)
#The table of standard genes in Ensembl can be download from [this link](https://github.com/carmonalab/scRNAseq_data_processing/blob/master/aux/EnsemblGenes105_Mmu_GRCm39.txt.gz)
ensembleRef <- "aux/EnsemblGenes105_Mmu_GRCm39.txt.gz"
object@assays$RNA@counts <- object@assays$RNA@data
object <- STACAS:::standardizeGeneSymbols(obj = object, EnsemblGeneFile=ensembleRef)object <- FindVariableFeatures(object, nfeatures = nfeatures) %>%
ScaleData() %>%
RunPCA(npcs=ndim) %>%
RunUMAP(dims=1:ndim)integrationMetrics <- list()
useMetrics <- c("batch_LISI","batch_nLISI", "batch_nLISI_perCellType", "batch_nLISI_perCellType_means",
"1-celltype_nLISI", "1-celltype_nLISI_means", "celltype_ASW", "celltype_ASW_means")
method = "uncorrected"
method.reduction <- "pca"
metricsObject <- object
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)Set up plot parameters
plot.list <- list()
metricsShow.batch <- grep("batch",names(integrationMetrics[[1]]),value = T)
metricsShow.celltype <- grep("celltype",names(integrationMetrics[[1]]),value = T)plotIntegrationMetricsDim <- function(plot.list, metricsShow.batch, metricsShow.celltype, reduction="umap", plot=T){
metricsShow.batch.caption <- paste(metricsShow.batch,round(as.numeric(integrationMetrics[[method]][metricsShow.batch]),2),collapse = "\n ")
metricsShow.celltype.caption <- paste(metricsShow.celltype,round(as.numeric(integrationMetrics[[method]][metricsShow.celltype]),2),collapse = "\n ")
plot.list[[method]][["batch"]] <- DimPlot(metricsObject, group.by = meta.batch, reduction = reduction) + theme(aspect.ratio = 1) + labs(subtitle = "Dataset/batch", title = method, caption = metricsShow.batch.caption)
plot.list[[method]][["label"]] <- DimPlot(metricsObject, group.by = meta.label, label=T, label.size = 4, reduction = reduction) + theme(aspect.ratio = 1) + labs(subtitle = "Cell labels", title = method, caption = metricsShow.celltype.caption)
if(plot) print(plot.list[[method]][["batch"]] | plot.list[[method]][["label"]])
return(plot.list)
} plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)object.uncorrected <- objectClusters are largely driven by study/batch of origin rather than subtype
A simple approach to mitigate batch effects is to select HVGs that are consistently variable across datasets. For this we’ll split by dataset/batch, calculate HVG for each, then identify shared genes using Seurat’s SelectIntegrationFeatures.
obj.list <- SplitObject(object, split.by = meta.batch)for (i in 1:length(obj.list)) {
#obj.list[[i]] <- NormalizeData(obj.list[[i]],assay="RNA",normalization.method="LogNormalize") # these data are already pre-normalized
obj.list[[i]] <- FindVariableFeatures(obj.list[[i]], nfeatures=nfeatures*2)
}
hvg <- SelectIntegrationFeatures(obj.list, nfeatures = nfeatures)Alternatively, calculate HVG excluding specific gene sets using FindVariableFeatures.STACAS function (default STACAS’ behaviour)
library(SignatuR)
my.genes.blocklist <- GetSignature(SignatuR$Hs)
for (i in 1:length(obj.list)) {
#obj.list[[i]] <- NormalizeData(obj.list[[i]],assay="RNA",normalization.method="LogNormalize")
obj.list[[i]] <- FindVariableFeatures.STACAS(obj.list[[i]], nfeat=nfeatures*2, genesBlockList = my.genes.blocklist)
}
hvg <- SelectIntegrationFeatures(obj.list, nfeatures = nfeatures)Re-calculate dimensionality reduction using hvg
object@assays$RNA@var.features <- hvg
object <- ScaleData(object) %>% RunPCA(npcs=ndim) %>% RunUMAP(dims=1:ndim)method = "sharedFeatures"
method.reduction <- "pca"
metricsObject <- object
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)STACAS integration
object_integrated_stacas <- Run.STACAS(obj.list, dims = 1:ndim, anchor.features = hvg) %>%
RunUMAP(dims = 1:ndim) method = "STACAS"
method.reduction <- "pca"
metricsObject <- object_integrated_stacas
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)Semi-supervised STACAS integration
When available, cell type annotations can be used to guide the alignment. STACAS will use this information to penalize anchors where cell types are inconsistent.
library(scGate)
my.genes.blocklist <- scGate::genes.blacklist.default$Mm
CD8T <- scGate::gating_model(name="CD8T", signature=c("Cd8a","Cd8b1"))
CD4T <- scGate::gating_model(name="CD4T", signature=c("Cd4","Cd40lg"))
models <- list("CD8T"=CD8T, "CD4T"=CD4T)
#CD4Tconv <- scGate::gating_model(name="CD4Tconv", signature=c("Cd4","Cd40lg","Foxp3-"))
#Treg <- scGate::gating_model(name="Treg", signature=c("Foxp3+"))
#models <- list("CD8T"=CD8T, "CD4Tconv"=CD4Tconv, "Treg"=Treg)
obj.list <- lapply(obj.list, function(x) {
x <- FindVariableFeatures.STACAS(x, nfeat=1000, genesBlockList = my.genes.blocklist)
x <- ScaleData(x) |> RunPCA(pcs=20) |> RunUMAP(dims=1:20)
scGate(x, model=models, multi.asNA=TRUE, reduction="pca")
})lapply(names(obj.list), function(n) {
x <- obj.list[[n]]
a <- DimPlot(x, group.by="scGate_multi") + theme(aspect.ratio = 1) + ggtitle(n)
b <- FeaturePlot(x, features=c("Cd8a","Cd4","Cd40lg","Foxp3"))
a | b
})## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
Same gating model on uncorrected dataset
object.uncorrected <- scGate(object.uncorrected, model=models, multi.asNA=TRUE, reduction="pca")##
## ### Detected a total of 4019 non-pure cells for CD8T (23.92% of total)
##
## ### Detected a total of 13760 non-pure cells for CD4T (81.89% of total)
DimPlot(object.uncorrected, group.by="scGate_multi") + theme(aspect.ratio = 1)ggsave("CD4_CD8_scGate_uncorrected_space.png", height=4.5, width=5)Here we indicate in cell.labels the metadata column that
contains cell annotations. We can generate simple annotations using
e.g. scGate:
object_integrated_ss <- obj.list %>% Run.STACAS(dims = 1:ndim, anchor.features = hvg, cell.labels = "scGate_multi")Note that there is no need for ALL cells to be annotated: we
recommend to set labels to NA or unknown for cells
that cannot be confidently annotated, and they won’t be penalized for
label inconsistency. In addition, you can decide how much weight to give
to cell labels with the label.confidence parameter (from 0
to 1).
Visualize on UMAP space
object_integrated_ss <- object_integrated_ss %>% RunUMAP(dims=1:ndim)method = "semisupSTACAS"
method.reduction <- "pca"
metricsObject <- object_integrated_ss
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)Semi-supervised STACAS with full annotations
When available, cell type annotations can be used to guide the alignment. STACAS will use this information to penalize anchors where cell types are inconsistent.
Here we indicate in cell.labels the metadata column that
contains cell annotations. We can generate simple annotations using
e.g. scGate:
object_integrated_ss_full <- obj.list %>% Run.STACAS(dims = 1:ndim, anchor.features = hvg, cell.labels = meta.label)Note that there is no need for ALL cells to be annotated: we
recommend to set labels to NA or unknown for cells
that cannot be confidently annotated, and they won’t be penalized for
label inconsistency. In addition, you can decide how much weight to give
to cell labels with the label.confidence parameter (from 0
to 1).
Visualize on UMAP space
object_integrated_ss_full <- object_integrated_ss_full %>% RunUMAP(dims=1:ndim)method = "semisupSTACAS_full"
method.reduction <- "pca"
metricsObject <- object_integrated_ss_full
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject,
metrics = useMetrics, meta.label = meta.label,
meta.batch = meta.batch, lisi_perplexity = lisi_perplexity,
method.reduction = method.reduction, metricsLabels = metricsLabels)plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)Seurat Integration - CCA method
# find anchors
obj.list.seuratAnchors <- FindIntegrationAnchors(obj.list, anchor.features = hvg, reduction = "cca", dims = 1:ndim)
# integrate data
object_integrated_cca <- IntegrateData(anchorset = obj.list.seuratAnchors, dims=1:ndim)
rm(obj.list.seuratAnchors)
#Visualize on UMAP space
object_integrated_cca <- object_integrated_cca %>% ScaleData() %>% RunPCA(npcs=ndim) %>% RunUMAP(dims=1:ndim)
method = "SEURAT_cca"
method.reduction <- "pca"
metricsObject <- object_integrated_cca
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)
plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)Seurat Integration - RPCA method
# find anchors
obj.list.seuratAnchors <- FindIntegrationAnchors(obj.list, anchor.features = hvg, reduction = "rpca", dims = 1:ndim)
# integrate data
object_integrated_rpca <- IntegrateData(anchorset = obj.list.seuratAnchors, dims=1:ndim)
rm(obj.list.seuratAnchors)Visualize on UMAP space
object_integrated_rpca <- object_integrated_rpca %>% ScaleData() %>% RunPCA(npcs=ndim) %>% RunUMAP(dims=1:ndim)method = "SEURAT_rpca"
method.reduction <- "pca"
metricsObject <- object_integrated_rpca
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)Harmony Integration
object_integrated_harmony <- RunHarmony(object, group.by.vars = meta.batch) Visualize on UMAP space
object_integrated_harmony <- RunUMAP(object_integrated_harmony, reduction = "harmony", dims=1:ndim)method = "Harmony"
method.reduction <- "harmony"
metricsObject <- object_integrated_harmony
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)FastMNN Integration
object_integrated_FastMNN <- RunFastMNN(object.list = obj.list, features = hvg, assay="RNA", d=ndim)Visualize on UMAP space
object_integrated_FastMNN <- RunUMAP(object_integrated_FastMNN, reduction = "mnn", dims=1:ndim)method = "FastMNN"
method.reduction <- "mnn"
metricsObject <- object_integrated_FastMNN
integrationMetrics[[method]] <- getIntegrationMetrics(object=metricsObject, metrics = useMetrics, meta.label = meta.label, meta.batch = meta.batch, lisi_perplexity = lisi_perplexity, method.reduction = method.reduction, metricsLabels = metricsLabels)plot.list <- plotIntegrationMetricsDim(plot.list,metricsShow.batch,metricsShow.celltype)Summary of Integration Metrics
Include celltypeLISI (without 1-)
integrationMetrics <- lapply(integrationMetrics, function(x) {
x$celltype_nLISI <- 1 - x$`1-celltype_nLISI`
x$celltype_nLISI_means <- 1 - x$`1-celltype_nLISI_means`
x
})integrationMetricsSummary <- data.frame(unlist(integrationMetrics)) %>% tibble::rownames_to_column() %>% dplyr::rename(value=unlist.integrationMetrics.) %>% separate(rowname, c("Method","Metric"), sep="\\.") #\\.[[:alpha:]]
ggplot(integrationMetricsSummary,aes(x=reorder_within(Method,-value, Metric), y=value, fill=Method)) + geom_bar(stat="identity") +
theme_bw() +
theme(legend.position="none", axis.text.x=element_text(angle = 90, vjust = 0.5, hjust=1)) + xlab("Dataset") + facet_wrap(~Metric, scales = "free", ncol = 5)ggsave("STACAS.compareMetrics.Tcells.summary.png", width = 16, height = 14)a <- integrationMetricsSummary %>% filter(Metric %in% c("celltype_ASW_means","batch_nLISI_perCellType_means")) %>%
pivot_wider(names_from = Metric, values_from = value ) %>%
ggplot(aes(x=batch_nLISI_perCellType_means, y=celltype_ASW_means, label=Method)) +
geom_point(aes(color=Method)) + geom_text(hjust=0.1, vjust=0) + theme_light()
b <- integrationMetricsSummary %>% filter(Metric %in% c("batch_nLISI_perCellType_means","celltype_nLISI_means")) %>%
pivot_wider(names_from = Metric, values_from = value ) %>%
ggplot(aes(x=batch_nLISI_perCellType_means, y=`celltype_nLISI_means`, label=Method)) +
geom_point(aes(color=Method)) + geom_text(hjust=0.1, vjust=0) + theme_light()
c <- integrationMetricsSummary %>% filter(Metric %in% c("celltype_ASW_means","celltype_nLISI_means")) %>%
pivot_wider(names_from = Metric, values_from = value ) %>%
ggplot(aes(x=`celltype_nLISI_means`, y=celltype_ASW_means, label=Method)) +
geom_point(aes(color=Method)) + geom_text(hjust=0.1, vjust=0) + theme_light()
a | b | cggsave("STACAS.compareMetrics.Tcells.scatter.png", plot = b, width = 5.5, height = 3.5)
ggsave("STACAS.compareMetrics.Tcells.scatter.pdf", plot = b, width = 5.5, height = 3.5)p <- wrap_plots(lapply(plot.list,wrap_plots),ncol=2)
pggsave("STACAS.compareMetrics.Tcells.umaps.png",p, width = 30, height = 30)Further reading
The STACAS package and installation instructions are available at: STACAS package
The code for this demo can be found on GitHub
References
Andreatta A., Carmona S. J. (2021). STACAS: Sub-Type Anchor Correction for Alignment in Seurat to integrate single-cell RNA-seq data. - Bioinformatics
Luecken, M. D., Büttner, M., Chaichoompu, K., Danese, A., Interlandi, M., Müller, M. F., … & Theis, F. J. (2022). Benchmarking atlas-level data integration in single-cell genomics. - Nature methods
Hao, Y., Hao, S., Andersen-Nissen, E., Mauck III, W. M., Zheng, S., Butler, A., … & Satija, R. (2021). Integrated analysis of multimodal single-cell data. - Cell